import pickle

# Load the raw k-means evaluation strings produced by an earlier run, e.g.
# "Within Set Sum of Squared Error = 1234.5 for k = 7".
# NOTE(review): pickle.load is only safe on trusted, locally-produced files.
with open('result_list.pickle', 'rb') as f:
    data = pickle.load(f)

# Strip the boilerplate so each entry becomes a [wssse, k] pair of strings.
clean_d = [
    line.replace('Within Set Sum of Squared Error = ', '')
        .replace('for k =', '')
        .split()
    for line in data
]
from matplotlib import pyplot as plt

# clean_d entries are [wssse, k] string pairs; split them into plottable
# numeric series for the elbow curve.
x = [int(pair[1]) for pair in clean_d]    # k values
y = [float(pair[0]) for pair in clean_d]  # WSSSE values

fig, ax = plt.subplots()
ax.grid(True)
ax.plot(x, y)
# Clamp the axes exactly to the data range.
plt.axis([min(x), max(x), min(y), max(y)])
plt.show()
from pyspark.mllib.clustering import KMeans, KMeansModel

# Reload the persisted k=17 model trained earlier.
sameModel = KMeansModel.load(sc, "hdfs:///user/cluster_model/cluster_model_k_17")

# Mapping of image key -> feature vector (presumably 64x64 features; each is
# reshaped to a flat 4096-d vector below).
# NOTE(review): pickle.load is only safe on trusted, locally-produced files.
with open('success_vec_protocol_1.pickle', 'rb') as objectf:
    data = pickle.load(objectf)

keys_rdd = sc.parallelize(list(data.keys()))
key_value_rdd = keys_rdd.map(lambda x: (x, data[x].reshape((4096,))))
# Python-2-only tuple-parameter lambdas (lambda (k, v): ...) are a
# SyntaxError on Python 3; index the pair instead.
array_rdd = key_value_rdd.map(lambda kv: kv[1])

from scipy.spatial.distance import euclidean

# For every point: (assigned cluster, distance to that cluster's centre).
distance = array_rdd.map(
    lambda point: (sameModel.predict(point),
                   euclidean(point, sameModel.centers[sameModel.predict(point)]))
)
# Radius of each cluster = distance of its farthest member from the centre.
radius = distance.reduceByKey(max)
radius.collect()
# Re-key every vector by its predicted cluster and gather the members of
# each cluster on the driver.
rd_hash = array_rdd.map(lambda point: (sameModel.predict(point), point))
rd_group = rd_hash.groupByKey()
group = rd_group.collect()

# Report cluster sizes (sum over an iterator avoids building a throwaway list).
for cluster_id, members in group:
    print(cluster_id, sum(1 for _ in members))

# Pull out the members of cluster 16 (chosen from the sizes printed above)
# and reshape each flat 4096-vector into a 1x4096 row.
for cluster_id, members in group:
    if cluster_id == 16:
        d = [member.reshape((1, 4096)) for member in members]
import random
import numpy as np

# The cluster-16 vectors whose source image keys we want to recover.
test_d = d

# Reshape the values to 1x4096 so they compare shape-for-shape against the
# entries of test_d. (Python-2-only tuple-parameter lambda replaced with
# indexing for Python 3 compatibility.)
reverse_hash = key_value_rdd.map(lambda kv: (kv[0], kv[1].reshape((1, 4096))))

# Ship the target vectors to every executor once instead of per-task.
brod = sc.broadcast(test_d)
def fil(k, v):
    """Return k when v is element-wise equal to any broadcast target vector.

    Returns None (falsy) when nothing matches, so the function doubles as a
    filter predicate keyed on the vector v.
    """
    for target in brod.value:
        if np.array_equal(target, v):
            return k
    return None
# Keep only the (key, vector) pairs whose vector is one of the broadcast
# targets; fil returns the key (truthy) on a match and None otherwise.
# (Indexing lambda instead of the Python-2-only tuple-parameter syntax.)
subset = reverse_hash.filter(lambda kv: fil(kv[0], kv[1])).collect()
len(subset)
subset[0][0]
img_dir = '/mnt/homes_img'
%matplotlib inline
import matplotlib.pyplot as plt
from scipy.ndimage import imread
for i in subset:
test_img = imread(img_dir+"/"+i[0])
print (i[0])
plt.imshow(test_img)
plt.show()